Database:
1. All RCTs registered at WHO ICTRP by Jan 1st 2016,
2. with start date between 2006 and 2015
3. with study type and design corresponding to RCT
4. with at least one country location among the 187 countries included in the GBD2010 study
We will:
1. Create replicates of the mapping of RCTs across diseases
In [3]:
# Load the trial database (not included in the repository, hence the absolute path)
data <- read.table(
  file = "/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/database_RCTs_regions_27diseases.txt"
)
names(data)
# Load the lookup table of names/labels for the 27 groups of diseases
Mgbd <- read.table(file = "Data/27_gbd_groups.txt")
For each disease, we simulate what the mapping of RCTs within regions would have been if the misclassification of RCTs across groups of diseases were corrected, given the sensitivity and specificity of the classifier for identifying each group of diseases.
To estimate the performance of the classifier for each group of diseases, we have a test set of 2,763 trials manually classified according to the 27-class grouping of diseases used in this work. The test set is described in Atal et al. BMC Bioinformatics 2016.
The method used is based on the one presented in Fox et al. Int J Epidemiol 2005.
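To do so, for each disease we will:
1. Draw NK = 10,000 values of the sensitivity and specificity of the classifier from Beta distributions based on the test-set counts
2. Derive the corresponding positive and negative predictive values (PPV and NPV)
3. Count and discard the draws that lead to impossible PPV or NPV values (<0 or >1)
4. For each remaining draw, randomly reclassify every trial according to the PPV and NPV
Replicated mappings are saved in a separate directory per disease (`Replicates/<disease>/`), with one `Reclassification__<k>.txt` file per replicate and the drawn sensitivities and specificities in `Sens_spec.txt`.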
In [4]:
# Split the "&"-separated GBD27 field of every trial and convert the pieces to
# numeric disease-group codes (one list element per trial)
Lgbd <- lapply(strsplit(as.character(data$GBD27), "&"), as.numeric)
In [5]:
# Classifier performance counts per disease group (one `dis` code per row;
# the TP_*/FN_*/TN_*/FP_* columns are read by the simulation cells)
PERF <- read.csv(file = "Tables/Performances_per_27disease_data.csv")
In [6]:
# Number of (sensitivity, specificity) draws simulated per disease
NK <- 1e4
# Fixed seed so that the random draws are reproducible
set.seed(7212)
In [7]:
# For all diseases taken together (pseudo-group g = 0).
# Sensitivity and specificity of the classifier are drawn from Beta
# distributions built on the test-set counts, converted into PPV/NPV, and each
# trial is then randomly reclassified once per valid draw. The draws and every
# replicate are written to the Replicates/<dis>/ directory.
t0 <- proc.time()
g <- 0
PERF_g <- PERF[PERF$dis == 0, ]
# Everything below assumes exactly one row of performance counts
stopifnot(nrow(PERF_g) == 1)
out_dir <- paste0("/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/Replicates/",
                  as.character(PERF_g$dis))
dir.create(out_dir)
# Which trials are relevant to the burden
# NOTE(review): this keeps the trials whose parsed GBD27 entry has exactly one
# element -- confirm that this is the intended definition of "relevant".
is_dis <- lengths(Lgbd) == 1
# Total number of trials (one element of Lgbd per trial)
N <- length(is_dis)
# Sensitivity and specificity: Beta(successes + 1, failures + 1) draws
sens_r <- PERF_g$TP_Dis
sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
spec_r <- PERF_g$TN_Dis
spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
sens <- rbeta(NK, sens_r + 1, sens_n - sens_r + 1)
spec <- rbeta(NK, spec_r + 1, spec_n - spec_r + 1)
write.table(data.frame(sens = sens, spec = spec),
            paste0(out_dir, "/Sens_spec.txt"))
# Observed numbers of trials classified as relevant / not relevant
a_dis <- sum(is_dis)
b_dis <- N - a_dis
# Back-calculated numbers of truly relevant (As) and truly non-relevant (Bs)
# trials for each draw, then the four cells of the confusion table
As <- (a_dis - (1 - spec) * N) / (sens - (1 - spec))
Bs <- N - As
T1 <- sens * As
T0 <- spec * Bs
F1 <- (1 - spec) * Bs
F0 <- (1 - sens) * As
PPV_dis <- T1 / (T1 + F1)
NPV_dis <- T0 / (T0 + F0)
# Draws leading to impossible predictive values (<0 or >1) are counted and dropped
false_it <- PPV_dis < 0 | PPV_dis > 1 |
  NPV_dis < 0 | NPV_dis > 1
print(paste(c(g, "has", sum(false_it), "suppressed false iterations"),
            collapse = " "))
PPV_dis <- PPV_dis[!false_it]
NPV_dis <- NPV_dis[!false_it]
# Simulation: reclassifying each trial
# seq_along() so that nothing is written when no draw survives the filter
for (k in seq_along(PPV_dis)) {
  tp_dis <- runif(a_dis)
  tn_dis <- runif(b_dis)
  recl_dis <- is_dis
  # Both masks come from the ORIGINAL classification: the a_dis draws line up
  # with the originally positive trials and the b_dis draws with the originally
  # negative ones. Recomputing the mask on recl_dis after the first flip would
  # change its length (the index would be recycled) and could flip a trial twice.
  recl_dis[is_dis][tp_dis > PPV_dis[k]] <- FALSE
  recl_dis[!is_dis][tn_dis > NPV_dis[k]] <- TRUE
  # File name (including the double underscore) kept unchanged so that existing
  # readers of the replicates still find the files
  write.table(data.frame(recl_dis = as.numeric(recl_dis)),
              paste0(out_dir, "/Reclassification_", "_", k, ".txt"),
              row.names = FALSE)
}
# Elapsed time in minutes
t1 <- proc.time() - t0
t1 / 60
In [8]:
# Codes of the 27 disease groups. For each of them we simulate the mapping
# across regions of the trials concerning that disease and of the trials
# concerning other diseases.
dis <- seq_len(27)
In [9]:
# For each disease group g in `dis`:
#   * draw sensitivity/specificity of the classifier for "trial concerns g" and
#     for "trial concerns another disease" from Beta distributions built on
#     the test-set counts,
#   * convert them into PPV/NPV given the observed counts,
#   * drop the draws giving impossible predictive values,
#   * randomly reclassify every trial once per remaining draw and write one
#     file per replicate.
t0 <- proc.time()
# Total number of trials (one element of Lgbd per trial)
N <- length(Lgbd)
for (g in dis) {
  PERF_g <- PERF[PERF$dis == g, ]
  # Everything below assumes exactly one row of performance counts per disease
  stopifnot(nrow(PERF_g) == 1)
  out_dir <- paste0("/media/igna/Elements/HotelDieu/Cochrane/MappingRCTs_vs_Burden/Replicates/",
                    as.character(PERF_g$dis))
  dir.create(out_dir)
  # Which trials concern the disease
  is_dis <- vapply(Lgbd, function(x) g %in% x, logical(1))
  # Which trials concern another disease
  other_groups <- setdiff(1:27, g)
  is_oth <- vapply(Lgbd, function(x) any(other_groups %in% x), logical(1))

  # PPVs and NPVs for finding the disease
  sens_r <- PERF_g$TP_Dis
  sens_n <- PERF_g$TP_Dis + PERF_g$FN_Dis
  spec_r <- PERF_g$TN_Dis
  spec_n <- PERF_g$TN_Dis + PERF_g$FP_Dis
  sens <- rbeta(NK, sens_r + 1, sens_n - sens_r + 1)
  spec <- rbeta(NK, spec_r + 1, spec_n - spec_r + 1)
  Dss <- data.frame(sens_dis = sens, spec_dis = spec)
  a_dis <- sum(is_dis)
  b_dis <- N - a_dis
  # Back-calculated true positives/negatives per draw, then the confusion table
  As <- (a_dis - (1 - spec) * N) / (sens - (1 - spec))
  Bs <- N - As
  T1 <- sens * As
  T0 <- spec * Bs
  F1 <- (1 - spec) * Bs
  F0 <- (1 - sens) * As
  PPV_dis <- T1 / (T1 + F1)
  NPV_dis <- T0 / (T0 + F0)

  # PPVs and NPVs for finding another disease
  sens_r <- PERF_g$TP_Oth
  sens_n <- PERF_g$TP_Oth + PERF_g$FN_Oth
  spec_r <- PERF_g$TN_Oth
  spec_n <- PERF_g$TN_Oth + PERF_g$FP_Oth
  sens <- rbeta(NK, sens_r + 1, sens_n - sens_r + 1)
  spec <- rbeta(NK, spec_r + 1, spec_n - spec_r + 1)
  Dss$sens_oth <- sens
  Dss$spec_oth <- spec
  a_oth <- sum(is_oth)
  b_oth <- N - a_oth
  As <- (a_oth - (1 - spec) * N) / (sens - (1 - spec))
  Bs <- N - As
  T1 <- sens * As
  T0 <- spec * Bs
  F1 <- (1 - spec) * Bs
  F0 <- (1 - sens) * As
  PPV_oth <- T1 / (T1 + F1)
  NPV_oth <- T0 / (T0 + F0)

  write.table(Dss, paste0(out_dir, "/Sens_spec.txt"))

  # Some values of sens and spec may lead to impossible values of PPV or NPV
  # (>1 or <0). We suppress and count them. If the total of suppressed
  # iterations is higher than 10% of total iterations we will modify the
  # distributions for Specificity and Sensitivity
  false_it <- PPV_dis < 0 | PPV_dis > 1 |
    NPV_dis < 0 | NPV_dis > 1 |
    PPV_oth < 0 | PPV_oth > 1 |
    NPV_oth < 0 | NPV_oth > 1
  print(paste(c(g, "has", sum(false_it), "suppressed false iterations"),
              collapse = " "))
  PPV_dis <- PPV_dis[!false_it]
  NPV_dis <- NPV_dis[!false_it]
  PPV_oth <- PPV_oth[!false_it]
  NPV_oth <- NPV_oth[!false_it]

  # Simulation: reclassifying each trial
  # seq_along() so that nothing is written when no draw survives the filter
  for (k in seq_along(PPV_dis)) {
    # Masks are taken from the ORIGINAL classification (is_dis / is_oth): the
    # a_* draws line up with the originally positive trials and the b_* draws
    # with the originally negative ones. Recomputing the mask on the partly
    # reclassified vector would change its length (the index would be
    # recycled) and could flip the same trial twice.
    tp_dis <- runif(a_dis)
    tn_dis <- runif(b_dis)
    recl_dis <- is_dis
    recl_dis[is_dis][tp_dis > PPV_dis[k]] <- FALSE
    recl_dis[!is_dis][tn_dis > NPV_dis[k]] <- TRUE
    # Other diseases
    tp_oth <- runif(a_oth)
    tn_oth <- runif(b_oth)
    recl_oth <- is_oth
    recl_oth[is_oth][tp_oth > PPV_oth[k]] <- FALSE
    recl_oth[!is_oth][tn_oth > NPV_oth[k]] <- TRUE
    # File name (including the double underscore) kept unchanged so that
    # existing readers of the replicates still find the files
    write.table(data.frame(recl_dis = as.numeric(recl_dis),
                           recl_oth = as.numeric(recl_oth)),
                paste0(out_dir, "/Reclassification_", "_", k, ".txt"),
                row.names = FALSE)
  }
}
t1 <- proc.time()
# Elapsed time in minutes
print((t1 - t0) / 60)
It took 13h
In [10]:
# Names of the disease groups for which more than 10% of the iterations were
# suppressed
Mgbd$cause_name[c(9, 11, 21, 23, 27)]
We will re-simulate only for diseases corresponding to more than 1% of local burden in a region